Information

Results

Samples Duration(sec) Candidates Duplicates Algorithm
10000 0.684 12531 22 LSH Cosine

Imports


In [54]:
import gzip
import tarfile
import numpy as np
import pandas as pd
import h5py as h5
import os
import glob
from sklearn import preprocessing
import math
import time
from scipy.spatial.distance import cosine
from itertools import combinations

Globals

Reading the data

  • data has to be a .h5 data file.
  • data_path should contain the path to this .h5 data file.

In [55]:
# 1 million summary data. Takes long!
data_path = 'msd_summary_file.h5'
# subset(10000 songs) summary data
data_path = 'MillionSongSubset/AdditionalFiles/subset_msd_summary_file.h5'

features = ['duration', 'end_of_fade_in','key', 'loudness', 'mode', 'start_of_fade_out','tempo','time_signature']

debug_print = True

In [56]:
# Comment out, when debugging.
np.random.seed(42)

Helper functions


In [57]:
'''
Reads the data and the feature names and returns the track ids and the feature matrix.
The track_id field from the data is mandatory, therefore it will always be included
Args:
    feature_names(list of strings): list containing the feature names that will be included in the feature matrix.
    data(pandas.io.pytables.HDFStore table): table containing the data.
Returns:
    (numpy.ndarray, numpy.ndarray): matrix(#samples x 1) of track_ids, feature matrix(#samples x #features).
'''
def get_feature_matrix(feature_names,data):
    if 'track_id' in feature_names:
        songs = np.asarray(data['/analysis/songs'][feature_names])
    else:
        songs = np.asarray(data['/analysis/songs'][['track_id'] + feature_names])
    #indices = np.random.choice(range(np.size(songs,0)), sample_size)
    return np.array(songs[:,0]),np.array(songs[:,1:],dtype=np.float64)


'''
Returns a vector with normal distributed {0,1} values.
Args:
    n(int) : size of the vector.
Returns:
    list(int): list of length n of normal distributed {0,1} values.
'''
def get_random_vector(n):
    #rv = np.random.normal(0, 1, n)
    #rv[rv < 0] = -1
    #rv[rv > 0] = 1
    #rv[rv == 0] = np.random.randint(0,2)*2-1
    rv = 2*np.random.randint(0,2,n)-1
    return rv


'''
Returns the cosine of the angle of two given vectors
Args:
    a(numpy.ndarray): vector of real values.
    b(numpy.ndarray): vector of real values.
Returns:
    double: Returns the cosine of the angle of a and b.
'''
def cosine_angle(a,b):
    return np.dot(a,b) / (np.linalg.norm(a) * np.linalg.norm(b))


'''
Returns the cosine distance between two vectors.
Args:
    a(numpy.ndarray): vector of real values.
    b(numpy.ndarray): vector of real values.
Returns:
    double: Returns the cosine distance of a and b.
'''
def cosine_distance(a,b):
    return 1.0-cosine_angle(a,b)


'''
Returns a matrix(#samples x #bands) with hash-values for the different song based on the respective bands.
Args:
    Sketch(numpy.ndarray): Sketch matrix(#samples x #rows*#bands) with values in domain {0,1}.
    r(int): number of rows
    b(int): number of bands
Returns:
    (numpy.ndarray, numpy.ndarray): Returns a matrix(#samples x #bands) with hash-values for the different song based on the respective bands.
'''
def get_hash_bands(Sketch, r,b):
    twos = np.array([1<<i for i in range(r)])
    Hashes = np.zeros((np.size(Sketch,0),b))
    for i in range(b):
        Hashes[:,i] = np.dot(Sketch[:,(r*i):(r*(i+1))], twos)
    return Hashes.astype(np.uint64)

In [58]:
'''
Operations:
- Get data
- Build feature matrix
- 0-1 normalize it

track_ids: matrix(#samples x 1)
feature_matrix: matrix(#samples x #features)
'''
songs = pd.HDFStore(data_path)
track_ids, feature_matrix = get_feature_matrix(features, songs)
feature_matrix = preprocessing.scale(feature_matrix)

if debug_print:
    print("Shape track_ids:",track_ids.shape)
    print("Shape feature matrix:",feature_matrix.shape)


Shape track_ids: (10000,)
Shape feature matrix: (10000, 8)

LSH Cosine Similarity Algorithm


In [59]:
# data and algorithm parameters
'''
D = number of features
N = number of samples
b = number of bands
r = number of rows
eps = angle threshold(degrees)
'''
D = np.size(feature_matrix,1)
N = np.size(feature_matrix, 0)
b = 3
r = 64
eps = 2

In [60]:
'''
Operations:
- Generate matrix of random vectors with values in {-1,1}.

RV: matrix(#bands*#rows x n_features)
'''
RV = np.array([get_random_vector(D) for i in range(b*r)])

if debug_print:
    print("Shape RV:",np.shape(RV))
    print("Random vectors matrix RV:\n",RV)


Shape RV: (192, 8)
Random vectors matrix RV:
 [[-1  1 -1 ...,  1 -1 -1]
 [-1  1 -1 ..., -1  1 -1]
 [ 1  1  1 ..., -1  1  1]
 ..., 
 [-1 -1  1 ..., -1  1 -1]
 [ 1 -1 -1 ..., -1 -1 -1]
 [ 1  1  1 ..., -1 -1  1]]

In [61]:
'''
Operations:
    - Generate sketch matrix, by performing 
Clip sketch matrix to 0-1 range for hashing.
Dimensionality: n_samples x n_bands*n_rows
'''
Sketch = np.dot(feature_matrix, RV.T)
Sketch[Sketch < 0] = -1
Sketch[Sketch > 0] = 1
Sketch[Sketch == 0] = np.random.randint(0,2)*2-1


if debug_print:
    print("Shape Sketch:",Sketch.shape)
    print("Sketch:\n",Sketch)


Shape Sketch: (10000, 192)
Sketch:
 [[-1. -1.  1. ...,  1. -1.  1.]
 [-1.  1. -1. ..., -1.  1. -1.]
 [ 1. -1. -1. ..., -1.  1.  1.]
 ..., 
 [-1. -1. -1. ...,  1.  1.  1.]
 [ 1.  1.  1. ..., -1.  1. -1.]
 [-1. -1. -1. ...,  1. -1.  1.]]

In [62]:
# clip values of Sketch matrix in domain {0,1} to easily hash them.
Sketch[Sketch < 0] = 0


if debug_print:
    print("Shape Binary Sketch:",Sketch.shape)
    print("Binary Sketch:\n",Sketch)


Shape Binary Sketch: (10000, 192)
Binary Sketch:
 [[ 0.  0.  1. ...,  1.  0.  1.]
 [ 0.  1.  0. ...,  0.  1.  0.]
 [ 1.  0.  0. ...,  0.  1.  1.]
 ..., 
 [ 0.  0.  0. ...,  1.  1.  1.]
 [ 1.  1.  1. ...,  0.  1.  0.]
 [ 0.  0.  0. ...,  1.  0.  1.]]

In [63]:
hb = get_hash_bands(Sketch,r, b)

if debug_print:
    print("Shape hb:",hb.shape)
    print("hb:\n",hb)


Shape hb: (10000, 3)
hb:
 [[ 1780874543489514496 10265066964849694720 11640106700832968704]
 [ 4796099063967254528  6978836959429062656  5478326752168664064]
 [15613821186084831232  4672718382042084352 15025905181223591936]
 ..., 
 [ 3709516171680541696  2032026892628070400 17900927777596678144]
 [16609655966554849280 16414193587434909696  5113284333975455744]
 [18167632941649860608 11472516267285336064 12957191469937338368]]

Algorithm Description

  • for each band b we store a hash_table in the dictionary buckets[b].
  • for each band b:
    • for each song s:
      • add song s to the bucket to which it hashes in the dictionary buckets[b].
    • for each key in buckets[b]:
      • if it contains more than one element, it contains candidates for duplication.
        • for all elements in this list, generate all unordered pairs.
        • for each such pair, if it is not contained in the candidates dictionary, add the pair with the corressponding cosine distance.
  • for each pair in candidates:
    • check if its cosine distance < cosine distance of epsilon, meaning we consider it a duplicate.

In [64]:
'''
candidates(dict): Dictionary with key=(song_id,song_id), value=cosine_distance(song_id,song_id)
duplicates(list): List of tuples (songid, songid)
buckets(dict)   : Dictionary with key=band_id, value=dict with key=hash_key, value = list of song_id
'''

candidates = {}
duplicates = []
buckets = { i : {} for i in range(b) }

start = time.time()
for i in range(b):
    for j in range(N):
        hash_key = hb[j,i]
        if hash_key not in buckets[i]:
            buckets[i][hash_key] = []
        buckets[i][hash_key].append(j)
    for candidates_list in buckets[i].values():
        if len(candidates_list) > 1:
            for _i in range(len(candidates_list)):
                for _j in range(_i+1,len(candidates_list)):
                    songA = candidates_list[_i]
                    songB = candidates_list[_j]
                    if (songA,songB) not in candidates:
                        candidates[(songA,songB)] = cosine_distance(feature_matrix[songA,:],feature_matrix[songB,:])


cos_eps_dist = 1-math.cos(math.radians(eps))
for key in candidates.keys():
    if candidates[key] < cos_eps_dist:
        songA = key[0]
        songB = key[1]
        duplicates.append((songA,songB))
        

print("LSH Duration:", time.time() - start,"sec")
print("Nr. candidates:", len(candidates.keys()))
print("Nr. duplicates:",len(duplicates))


LSH Duration: 0.26318907737731934 sec
Nr. candidates: 17643
Nr. duplicates: 22

In [ ]: